

library(tidyverse)
library(readr)
library(fuzzyjoin)
library(car)
library(stringr)
library(maps)
library(data.table)
library(cdlTools)
setwd(rprojroot::find_root(rprojroot::is_rstudio_project))
getwd()
df<-fread("Intermediate Data/house_vote_level_data.csv")
census <- fread("Intermediate Data/Census Data/DistrictCensusData_CCES_070323.csv")
colnames(census)
head(df)

link <- readxl::read_xlsx("Intermediate Data/policy_outcomes_and_notes.xlsx")
link<-link[!is.na(link$data_num),]
house <- link[c("variable_name_house","data_num", "data_year", "vote_before_cces", "vote_after_cces")]
house$vote <- paste0(house$variable_name_house, house$data_year)
senate <- link[c("variable_name_senate","data_num", "data_year", "vote_before_cces", "vote_after_cces")]
senate$vote <- paste0(senate$variable_name_senate, senate$data_year)
house_only <- subset(house, !house$variable_name_house%in%senate$variable_name_senate)
colnames(house_only) <- colnames(senate)
  
df$year<-str_sub(df$vote,-4,-1)
df$congress <- dplyr::recode(df$year, 
                             "2006" = "109th",
                             "2007" = "110th",
                             "2008" = "110th",
                             "2009" = "111th",
       "2010" = "111th",
       "2011" = "112th",
       "2012" = "112th",
       "2013" = "113th",
       "2014" = "113th",
       "2015" = "114th",
       "2016" = "114th",
       "2017" = "115th",
       "2018" = "115th",
       "2019" = "116th",
       "2020" = "116th",
       "2021" = "117th",
       "2022" = "117th")
df$state_ab<-fips(df$state, to="Abbreviation")
census$YEAR <- as.character(census$YEAR)
census$cd <- str_sub(census$NAME_E,24,25)
census$cd <- ifelse(census$cd=="(a", "1", census$cd)
census$cd <- as.numeric(gsub("([0-9]+).*$", "\\1", census$cd))
census$congress <- ifelse(census$YEAR=="2006", "109th", census$congress)
#census$STATE <-
df<-left_join(df, census, by=c("state_ab"="STUSAB", "congress"="congress", "cd" = "cd"))#, "year"="YEAR"))
df <-left_join(df, house, by="vote")
sum(is.na(df$d_p_black))
rm(census)

mcs <- fread("Intermediate Data/HSall_members.csv")
mcs <- subset(mcs, mcs$congress>108)
mcs$congress <- paste0(mcs$congress, "th")
#randomly drop duplicate icpsr codes
mcs<-mcs %>% 
  group_by(icpsr, congress) %>%
  sample_n(1)
df<-left_join(df, mcs, by=c("MC" = "icpsr", "congress" = "congress"))

write_csv2(df, "Data/house_vote_level_data_census.csv")
rm(df, house)

x<-fread("Intermediate Data/senate_vote_level_data.csv", fill=T) %>% select(-c("year.x")) %>% rename(year = year.y)
y <- fread("Intermediate Data/senate_vote_level_data_missingvotes.csv", fill=T)


y<-y %>% select("X", "total", "dat_vote_ID.y", "outcome", "sen", 
                "state", "cd", "fips", "female", 
                "race", "edu", "dob", "ideo", 
                "income", "pid7", "reg_voter", "donate_any", 
                "voter_ver", "vv_matched", "house_vote", "zip", 
                "donate_amount", "church_attendance", "prayer", "importance", "weight", "pres_vote", "religion", "voter_status", "dat_vote_ID", "year.y", 
                "resp_vote_ID", "respondent_view") %>% mutate(dat_vote_ID.y = paste0(dat_vote_ID.y, "_3"),
                                                              outcome = NA, total = NA) %>%
  rename_with(~colnames(x))

x<-rbind(x, y)
census <- fread("Intermediate Data/Census Data/StateCensusData_CCES_070323.csv", fill=T)
x$year<-str_sub(x$vote,-6,-3)
x$state_ab<-fips(x$state, to="Abbreviation")
census$YEAR <- as.character(census$YEAR)
census$state_ab <- fips(census$STATE, to="Abbreviation")
census$state_ab[census$STATE == "North Carolina"] <- "NC"
census$state_ab[census$STATE == "North Dakota"] <- "ND"
census$state_ab[census$STATE == "New Hampshire"] <- "NH"
census$state_ab[census$STATE == "New Jersey"] <- "NJ"
census$state_ab[census$STATE == "New Mexico"] <- "NM"
census$state_ab[census$STATE == "New York"] <- "NY"
census$state_ab[census$STATE == "Rhode Island"] <- "RI"
census$state_ab[census$STATE == "South Carolina"] <- "SC"
census$state_ab[census$STATE == "South Dakota"] <- "SD"
census$state_ab[census$STATE == "West Virginia"] <- "WV"


x<-left_join(x, census, by=c("state_ab"="state_ab", "year"="YEAR"))

x$merge_failed <- is.na(x$s_medinc)
df<-subset(x, x$merge_failed==T)

rm(census, df)

mcs <- fread("Intermediate Data/HSall_members.csv")
mcs <- subset(mcs, mcs$congress>108)
mcs$congress <- paste0(mcs$congress, "th")
#randomly drop duplicate icpsr codes
mcs<-mcs %>% 
  group_by(icpsr, congress) %>%
  sample_n(1)
x$congress <- dplyr::recode(x$year, "2006" = "109th",
                            "2007" = "110th",
                            "2008" = "110th",
                            "2009" = "111th",
                            "2010" = "111th",
                            "2011" = "112th",
                            "2012" = "112th",
                            "2013" = "113th",
                            "2014" = "113th",
                            "2015" = "114th",
                            "2016" = "114th",
                            "2017" = "115th",
                            "2018" = "115th",
                            "2019" = "116th",
                            "2020" = "116th",
                            "2021" = "117th",
                            "2022" = "117th")
x<-left_join(x, mcs, by=c("Sen" = "icpsr", "congress" = "congress"))
x$vote_merge <- substr(x$vote, 1, regexpr("_", x$vote) - 1)

x <-left_join(x, rbind(senate, house_only), by=c("vote_merge"="vote"))


write_csv2(x, "Data/senate_vote_level_data_census.csv")
rm(x, y, senate, house_only)

